Final Project Rmarkdown File

Group Members: Adam Gonzalez, Jon Le (4pm section), Erin Lee, Debbie Lu & Corinne Smith

NOTE: echo = FALSE parameter is added to the code chunk to prevent printing of the R code that generated the plot.

1) CLEAR ENVIRONMENT

# code to remove objects in Environment before knitting
rm(list = ls())

2) LOAD THE DATA

books <- read.csv(here::here("datasets", "books.csv"))
books_two <- read.csv(here::here("datasets", "books_two.csv"))

3) LOAD LIBRARIES

# ---------------------------------------
library('yardstick')
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
# ---------------------------------------

# ---------------------------------------
# data visualization
# --------------------------------------
library('ggplot2')
library('plotly')
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library('gganimate')
library('ggridges')

# ---------------------------------------
# data manipulation
# --------------------------------------
library('forcats')
library('tidyverse')
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.3     ✓ purrr   0.3.4
## ✓ tidyr   1.1.2     ✓ dplyr   1.0.2
## ✓ readr   1.3.1     ✓ stringr 1.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag()    masks stats::lag()
## x readr::spec()   masks yardstick::spec()
library('magrittr')
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library('lubridate')
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library('dplyr')
library('DT')
#install.packages("formattable")
#install.packages("tidyr")
library('formattable')
## 
## Attaching package: 'formattable'
## The following object is masked from 'package:plotly':
## 
##     style
library('tidyr')
library('data.table')
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library('kableExtra')
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# ---------------------------------------
# sentiment analysis
# --------------------------------------
library('sentimentr')
## 
## Attaching package: 'sentimentr'
## The following object is masked from 'package:plotly':
## 
##     highlight
# ---------------------------------------
# summary statistics
# --------------------------------------
#install.packages("qwraps2")
library("qwraps2")

# ---------------------------------------
# model validation library
# ---------------------------------------
library('rsample')

# ---------------------------------------
# generalized linear model libraries
# ---------------------------------------
library('glmnet')
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.0-2
library('glmnetUtils')
## 
## Attaching package: 'glmnetUtils'
## The following objects are masked from 'package:glmnet':
## 
##     cv.glmnet, glmnet
# ---------------------------------------
# regression output
# ---------------------------------------
# install.packages('sjPlot')
library('sjPlot')
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
# install.packages('sjPlot')
library('tidymodels')
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.1 ──
## ✓ broom     0.7.0      ✓ parsnip   0.1.3 
## ✓ dials     0.0.9      ✓ recipes   0.1.13
## ✓ infer     0.5.3      ✓ tune      0.1.1 
## ✓ modeldata 0.0.2      ✓ workflows 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
## x data.table::between()    masks dplyr::between()
## x scales::discard()        masks purrr::discard()
## x Matrix::expand()         masks tidyr::expand()
## x magrittr::extract()      masks tidyr::extract()
## x dplyr::filter()          masks plotly::filter(), stats::filter()
## x data.table::first()      masks dplyr::first()
## x recipes::fixed()         masks stringr::fixed()
## x kableExtra::group_rows() masks dplyr::group_rows()
## x dplyr::lag()             masks stats::lag()
## x data.table::last()       masks dplyr::last()
## x Matrix::pack()           masks tidyr::pack()
## x magrittr::set_names()    masks purrr::set_names()
## x readr::spec()            masks yardstick::spec()
## x recipes::step()          masks stats::step()
## x data.table::transpose()  masks purrr::transpose()
## x Matrix::unpack()         masks tidyr::unpack()
# ---------------------------------------
# random forest libraries
# ---------------------------------------
library('partykit')
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
#library('tidyverse')
library('PerformanceAnalytics')
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
## 
##     first, last
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
library('rpart')      
## 
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
## 
##     prune
library('rpart.plot')  
library('randomForest')
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
#install.packages("randomForestExplainer")
library('randomForestExplainer')
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# ---------------------------------------
# lasso libraries
# ---------------------------------------
library('broom')
library('coefplot')
## 
## Attaching package: 'coefplot'
## The following object is masked from 'package:qwraps2':
## 
##     invlogit
# ---------------------------------------
books <- books %>% rename(avg_book_rating = average_rating,
                          book_ratings_count = ratings_count,
                          author = authors)
books_two <- books_two %>% rename(author = authors,
                                  authorworkcount = workcount,
                                  author_fans = fan_count,
                                  avg_author_rating = average_rate,
                                  author_ratings_count = rating_count,
                                  author_review_count = review_count,
                                  )

4) SENTIMENT ANALYSIS

Need ‘sentimentr’ library.

sentiment_DF <- get_sentences(books$title) %>% sentiment_by(books$title)

5) MERGING

head(sentiment_DF)
##                                                                                                                            title
## 1:                                                                                                                              
## 2:                                                                                                 said the shotgun to the head.
## 3: $30 Film School: How to Write  Direct  Produce  Shoot  Edit  Distribute  Tour With  and Sell Your Own No-Budget Digital Movie
## 4:                                                                                                                  'Salem's Lot
## 5:                                                                                            1 000 Places to See Before You Die
## 6:                                                                                                                 10 lb Penalty
##    word_count sd ave_sentiment
## 1:          0  0     0.0000000
## 2:          6 NA    -0.1632993
## 3:         20 NA    -0.1118034
## 4:         16  0     0.0000000
## 5:          6 NA    -0.3061862
## 6:          2 NA    -0.5303301
books_s <- inner_join(x = books,
                 y = sentiment_DF,
                 by = "title")
head(books_s)
##   bookID
## 1      1
## 2      2
## 3      4
## 4      5
## 5      8
## 6      9
##                                                                                   title
## 1                             Harry Potter and the Half-Blood Prince (Harry Potter  #6)
## 2                          Harry Potter and the Order of the Phoenix (Harry Potter  #5)
## 3                            Harry Potter and the Chamber of Secrets (Harry Potter  #2)
## 4                           Harry Potter and the Prisoner of Azkaban (Harry Potter  #3)
## 5                                Harry Potter Boxed Set  Books 1-5 (Harry Potter  #1-5)
## 6 Unauthorized Harry Potter Book Seven News: Half-Blood Prince Analysis and Speculation
##                       author avg_book_rating       isbn        isbn13
## 1 J.K. Rowling/Mary GrandPré            4.57 0439785960 9780439785969
## 2 J.K. Rowling/Mary GrandPré            4.49 0439358078 9780439358071
## 3               J.K. Rowling            4.42 0439554896 9780439554893
## 4 J.K. Rowling/Mary GrandPré            4.56 043965548X 9780439655484
## 5 J.K. Rowling/Mary GrandPré            4.78 0439682584 9780439682589
## 6     W. Frederick Zimmerman            3.74 0976540606 9780976540601
##   language_code num_pages book_ratings_count text_reviews_count
## 1           eng       652            2095690              27591
## 2           eng       870            2153167              29221
## 3           eng       352               6333                244
## 4           eng       435            2339585              36325
## 5           eng      2690              41428                164
## 6         en-US       152                 19                  1
##   publication_date       publisher word_count sd ave_sentiment
## 1        9/16/2006 Scholastic Inc.         18  0     0.2000000
## 2         9/1/2004 Scholastic Inc.         10 NA     0.0000000
## 3        11/1/2003      Scholastic         18  0     0.0000000
## 4         5/1/2004 Scholastic Inc.         18  0    -0.2500000
## 5        9/13/2004      Scholastic          7 NA     0.0000000
## 6        4/26/2005    Nimble Books         12 NA    -0.1299038
head(books_two)
##   authorid                author authorworkcount author_fans  gender
## 1  8409092       Jason   Wallace               2          13    male
## 2  5796406          Rosan Hollak               4           0 unknown
## 3  8421525            Nanna Foss               6         156  female
## 4   158146     Terri Savelle Foy              23         125  female
## 5 15340731 Vishwas Nangare Patil               1         127 unknown
## 6  7189636           Shweta Punj               2           3 unknown
##                                                                                    image_url
## 1                              https://images.gr-assets.com/authors/1489266848p7/8409092.jpg
## 2 https://s.gr-assets.com/assets/nophoto/user/u_333x500-46491541e26dbeac15f51487d68dd207.png
## 3                              https://images.gr-assets.com/authors/1409085874p7/8421525.jpg
## 4                               https://images.gr-assets.com/authors/1475694606p7/158146.jpg
## 5 https://s.gr-assets.com/assets/nophoto/user/u_333x500-46491541e26dbeac15f51487d68dd207.png
## 6 https://s.gr-assets.com/assets/nophoto/user/u_333x500-46491541e26dbeac15f51487d68dd207.png
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   about
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             Jason Wallace is related to Tolkien and a descendant of one of the first International English cricketers, and also of the world-renowned Victorian circus owner 'Lord' George Sanger. He was born in Cheltenham in 1969 but moved to London after his parents split up. Aged 12 his life was turned upside down when his mother remarried and the family emigrated to Zimbabwe. It was this experience in a tough boarding school during the aftermath of the war for independence that forms the foundation of his incredible first novel, Out of Shadows. And he did actually meet Robert Mugabe when he visited his school.<br /><br />Jason is currently a web designer, living in South West London with his partner and son.<br />
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Danish YA writer. <br /><br />I make up stories about time travel, friendship, love, and mysterious adventures. <br /><br />I like to read books that make me cry or laugh or both.<br /><br />Danish Social Media:<br />Instagram: <a target="_blank" href="http://www.instagram.com/nannafoss" rel="nofollow">www.instagram.com/nannafoss</a><br />Facebook: <a target="_blank" href="http://www.facebook.com/nannafoss" rel="nofollow">www.facebook.com/nannafoss</a><br />Blog: <a target="_blank" href="http://www.nannafoss.dk" rel="nofollow">www.nannafoss.dk</a> <br /><br />English Social Media:<br />Tumblr: <a target="_blank" href="http://www.nannafoss.tumblr.com" rel="nofollow">www.nannafoss.tumblr.com</a><br /><br />YouTube playlists for my books:<br /><a target="_blank" href="http://www.youtube.com/channel/UCUL5UmL4QUSPyywz4ULc-gg/playlists" rel="nofollow">www.youtube.com/channel/UCUL5UmL4QUSP...</a>
## 4 For years, Terri Savelle Foy’s life was average. She had no dreams to pursue. Each passing day was just a repeat of the day before. Finally, with a marriage in trouble and her life falling apart, Terri made a change. She began to pursue God like never before, develop a new routine and discovered the power of having a dream and purpose.<br /><br />As Terri started to recognize her own dreams and goals, she simply wrote them down and reviewed them consistently. This written vision became a road map to drive her life. As a result, those dreams are now a reality.<br /><br />Terri has become the CEO of an international Christian ministry. She is an author, a conference speaker, and a success coach to hundreds of thousands of people all over the world. Her best-selling books Make Your Dreams Bigger than Your Memories, and Imagine Big have helped people discover how to overcome the hurts of the past and see the possibilities of a limitless future. Her weekly podcast is a lifeline of hope and inspiration to people around the world.<br /><br />Terri Savelle Foy is a cheerleader of dreams and is convinced that “if you can dream it, God can do it.” She is known across the globe as a world-class motivator of hope and success through her transparent and humorous teaching style. Terri’s unique ability to communicate success strategies in a simple and practical way has awakened the dreams of the young and old alike. <br /><br />Terri shares from personal experience the biblical concepts of using the gift of the imagination to reach full potential in Jesus Christ. From stay-at-home moms to business executives, Terri consistently inspires others to go after their dreams. With step-by-step instruction and the inspiration to follow through, people are fueled with the passion to complete their life assignment down to the last detail (see John 17:4).<br /><br />Terri and her husband, Rodney Foy, have been married since 1991, and are the parents of a beautiful redheaded daughter, Kassidi Cherie. They live near Dallas, Texas. For more information about Terri, go to <a target="_blank" href="http://www.terri.com" rel="nofollow">www.terri.com</a>.<br />
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
##   born died influence avg_author_rating author_ratings_count
## 1                                  3.74                 1028
## 2                                  3.73                   15
## 3                                  4.35                 1172
## 4                                  4.56                 1054
## 5                                  4.15                  725
## 6                                  3.65                  231
##   author_review_count                 website         twitter
## 1                 175                                        
## 2                   1                                        
## 3                 205 http://www.nannafoss.dk                
## 4                 151    http://www.terri.com terrisavellefoy
## 5                  43                                        
## 6                  25                                        
##                                              genre original_hometown
## 1                                                         Cheltenham
## 2                                                                   
## 3                       fantasy,fiction,paranormal                  
## 4 religion and spirituality,self help,spirituality                  
## 5                                                                   
## 6                                                                   
##          country latitude longitude
## 1 United Kingdom 51.90006  -2.07972
## 2                      NA        NA
## 3                      NA        NA
## 4                      NA        NA
## 5                      NA        NA
## 6                      NA        NA
books_sa <- 
  inner_join(x = books_s,
            y = books_two,
            by = "author")

head(books_sa)
##   bookID
## 1      4
## 2     10
## 3     12
## 4     13
## 5     14
## 6     18
##                                                                                                          title
## 1                                                   Harry Potter and the Chamber of Secrets (Harry Potter  #2)
## 2                                                                 Harry Potter Collection (Harry Potter  #1-6)
## 3 The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy  #1-5)
## 4                       The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1-5)
## 5                                  The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1)
## 6                                     The Ultimate Hitchhiker's Guide (Hitchhiker's Guide to the Galaxy  #1-5)
##          author avg_book_rating       isbn        isbn13 language_code
## 1  J.K. Rowling            4.42 0439554896 9780439554893           eng
## 2  J.K. Rowling            4.73 0439827604 9780439827607           eng
## 3 Douglas Adams            4.38 0517226952 9780517226957           eng
## 4 Douglas Adams            4.38 0345453743 9780345453747           eng
## 5 Douglas Adams            4.22 1400052920 9781400052929           eng
## 6 Douglas Adams            4.38 0517149257 9780517149256           eng
##   num_pages book_ratings_count text_reviews_count publication_date
## 1       352               6333                244        11/1/2003
## 2      3342              28242                808        9/12/2005
## 3       815               3628                254        11/1/2005
## 4       815             249558               4080        4/30/2002
## 5       215               4930                460         8/3/2004
## 6       815               2877                195        1/17/1996
##        publisher word_count sd ave_sentiment authorid authorworkcount
## 1     Scholastic         18  0     0.0000000  1077326             242
## 2     Scholastic          5 NA     0.0000000  1077326             242
## 3 Gramercy Books         15 NA     0.3098387        4             103
## 4  Del Rey Books         12 NA     0.3464102        4             103
## 5          Crown         44  0     0.3618136        4             103
## 6    Wings Books          9 NA     0.4000000        4             103
##   author_fans gender
## 1      209174 female
## 2      209174 female
## 3       19029   male
## 4       19029   male
## 5       19029   male
## 6       19029   male
##                                                       image_url
## 1 https://images.gr-assets.com/authors/1510435123p7/1077326.jpg
## 2 https://images.gr-assets.com/authors/1510435123p7/1077326.jpg
## 3       https://images.gr-assets.com/authors/1189120061p7/4.jpg
## 4       https://images.gr-assets.com/authors/1189120061p7/4.jpg
## 5       https://images.gr-assets.com/authors/1189120061p7/4.jpg
## 6       https://images.gr-assets.com/authors/1189120061p7/4.jpg
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     about
## 1 See also: <a href="https://www.goodreads.com/author/show/383606.Robert_Galbraith" title="Robert Galbraith" rel="nofollow">Robert Galbraith</a><br />Although she writes under the pen name <b>J.K. Rowling</b>, pronounced like <i>rolling</i>, her name when her first <i>Harry Potter</i> book was published was simply <b>Joanne Rowling</b>. Anticipating that the target audience of young boys might not want to read a book written by a woman, her publishers demanded that she use two initials, rather than her full name. As she had no middle name, she chose <b>K</b> as the second initial of her pen name, from her paternal grandmother Kathleen Ada Bulgen Rowling. She calls herself <b>Jo</b> and has said, "No one ever called me 'Joanne' when I was young, unless they were angry." Following her marriage, she has sometimes used the name <b>Joanne Murray</b> when conducting personal business. During the Leveson Inquiry she gave evidence under the name of <b>Joanne Kathleen Rowling</b>. In a 2012 interview, Rowling noted that she no longer cared that people pronounced her name incorrectly.<br /><br />Rowling was born to Peter James Rowling, a Rolls-Royce aircraft engineer, and Anne Rowling (née Volant), on 31 July 1965 in Yate, Gloucestershire, England, 10 miles (16 km) northeast of Bristol. Her mother Anne was half-French and half-Scottish. Her parents first met on a train departing from King's Cross Station bound for Arbroath in 1964. They married on 14 March 1965. Her mother's maternal grandfather, Dugald Campbell, was born in Lamlash on the Isle of Arran. Her mother's paternal grandfather, Louis Volant, was awarded the Croix de Guerre for exceptional bravery in defending the village of Courcelles-le-Comte during the First World War.<br /><br />Rowling's sister Dianne was born at their home when Rowling was 23 months old. The family moved to the nearby village Winterbourne when Rowling was four. She attended St Michael's Primary School, a school founded by abolitionist William Wilberforce and education reformer Hannah More. Her headmaster at St Michael's, Alfred Dunn, has been suggested as the inspiration for the <i>Harry Potter</i> headmaster Albus Dumbledore.<br /><br />As a child, Rowling often wrote fantasy stories, which she would usually then read to her sister. She recalls that: "I can still remember me telling her a story in which she fell down a rabbit hole and was fed strawberries by the rabbit family inside it. Certainly the first story I ever wrote down (when I was five or six) was about a rabbit called Rabbit. He got the measles and was visited by his friends, including a giant bee called Miss Bee." At the age of nine, Rowling moved to Church Cottage in the Gloucestershire village of Tutshill, close to Chepstow, Wales. When she was a young teenager, her great aunt, who Rowling said "taught classics and approved of a thirst for knowledge, even of a questionable kind," gave her a very old copy of Jessica Mitford's autobiography, <i>Hons and Rebels</i>. Mitford became Rowling's heroine, and Rowling subsequently read all of her books.<br /><br />Rowling has said of her teenage years, in an interview with The New Yorker, "I wasn’t particularly happy. I think it’s a dreadful time of life." She had a difficult homelife; her mother was ill and she had a difficult relationship with her father (she is no longer on speaking terms with him). She attended secondary school at Wyedean School and College, where her mother had worked as a technician in the science department. Rowling said of her adolescence, "Hermione [a bookish, know-it-all <i>Harry Potter</i> character] is loosely based on me. She's a caricature of me when I was eleven, which I'm not particularly proud of." Steve Eddy, who taught Rowling English when she first arrived, remembers her as "not exceptional" but "one of a group of girls who were bright, and quite good at English." Sean Harris, her best friend in the Upper Sixth owned a turquoise Ford Anglia, which she says inspired the one in her books.
## 2 See also: <a href="https://www.goodreads.com/author/show/383606.Robert_Galbraith" title="Robert Galbraith" rel="nofollow">Robert Galbraith</a><br />Although she writes under the pen name <b>J.K. Rowling</b>, pronounced like <i>rolling</i>, her name when her first <i>Harry Potter</i> book was published was simply <b>Joanne Rowling</b>. Anticipating that the target audience of young boys might not want to read a book written by a woman, her publishers demanded that she use two initials, rather than her full name. As she had no middle name, she chose <b>K</b> as the second initial of her pen name, from her paternal grandmother Kathleen Ada Bulgen Rowling. She calls herself <b>Jo</b> and has said, "No one ever called me 'Joanne' when I was young, unless they were angry." Following her marriage, she has sometimes used the name <b>Joanne Murray</b> when conducting personal business. During the Leveson Inquiry she gave evidence under the name of <b>Joanne Kathleen Rowling</b>. In a 2012 interview, Rowling noted that she no longer cared that people pronounced her name incorrectly.<br /><br />Rowling was born to Peter James Rowling, a Rolls-Royce aircraft engineer, and Anne Rowling (née Volant), on 31 July 1965 in Yate, Gloucestershire, England, 10 miles (16 km) northeast of Bristol. Her mother Anne was half-French and half-Scottish. Her parents first met on a train departing from King's Cross Station bound for Arbroath in 1964. They married on 14 March 1965. Her mother's maternal grandfather, Dugald Campbell, was born in Lamlash on the Isle of Arran. Her mother's paternal grandfather, Louis Volant, was awarded the Croix de Guerre for exceptional bravery in defending the village of Courcelles-le-Comte during the First World War.<br /><br />Rowling's sister Dianne was born at their home when Rowling was 23 months old. The family moved to the nearby village Winterbourne when Rowling was four. She attended St Michael's Primary School, a school founded by abolitionist William Wilberforce and education reformer Hannah More. Her headmaster at St Michael's, Alfred Dunn, has been suggested as the inspiration for the <i>Harry Potter</i> headmaster Albus Dumbledore.<br /><br />As a child, Rowling often wrote fantasy stories, which she would usually then read to her sister. She recalls that: "I can still remember me telling her a story in which she fell down a rabbit hole and was fed strawberries by the rabbit family inside it. Certainly the first story I ever wrote down (when I was five or six) was about a rabbit called Rabbit. He got the measles and was visited by his friends, including a giant bee called Miss Bee." At the age of nine, Rowling moved to Church Cottage in the Gloucestershire village of Tutshill, close to Chepstow, Wales. When she was a young teenager, her great aunt, who Rowling said "taught classics and approved of a thirst for knowledge, even of a questionable kind," gave her a very old copy of Jessica Mitford's autobiography, <i>Hons and Rebels</i>. Mitford became Rowling's heroine, and Rowling subsequently read all of her books.<br /><br />Rowling has said of her teenage years, in an interview with The New Yorker, "I wasn’t particularly happy. I think it’s a dreadful time of life." She had a difficult homelife; her mother was ill and she had a difficult relationship with her father (she is no longer on speaking terms with him). She attended secondary school at Wyedean School and College, where her mother had worked as a technician in the science department. Rowling said of her adolescence, "Hermione [a bookish, know-it-all <i>Harry Potter</i> character] is loosely based on me. She's a caricature of me when I was eleven, which I'm not particularly proud of." Steve Eddy, who taught Rowling English when she first arrived, remembers her as "not exceptional" but "one of a group of girls who were bright, and quite good at English." Sean Harris, her best friend in the Upper Sixth owned a turquoise Ford Anglia, which she says inspired the one in her books.
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              Douglas Noël Adams was an English author, comic radio dramatist, and musician. He is best known as the author of the <i>\n  <a href="https://www.goodreads.com/book/show/11.Hitchhiker_s_Guide_to_the_Galaxy" title="Hitchhiker's Guide to the Galaxy" rel="nofollow">Hitchhiker's Guide to the Galaxy</a>\n</i> series. Hitchhiker's began on radio, and developed into a "trilogy" of five books (which sold more than fifteen million copies during his lifetime) as well as a television series, a comic book series, a computer game, and a feature film that was completed after Adams' death. The series has also been adapted for live theatre using various scripts; the earliest such productions used material newly written by Adams. He was known to some fans as Bop Ad (after his illegible signature), or by his initials "DNA".<br /><br />In addition to <i>The Hitchhiker's Guide to the Galaxy</i>, Douglas Adams wrote or co-wrote three stories of the science fiction television series Doctor Who and served as Script Editor during the seventeenth season. His other written works include the Dirk Gently novels, and he co-wrote two Liff books and <i>Last Chance to See</i>, itself based on a radio series. Adams also originated the idea for the computer game <i>Starship Titanic</i>, which was produced by a company that Adams co-founded, and adapted into a novel by Terry Jones. A posthumous collection of essays and other material, including an incomplete novel, was published as <i>\n  <a href="https://www.goodreads.com/book/show/359.The_Salmon_of_Doubt" title="The Salmon of Doubt" rel="nofollow">The Salmon of Doubt</a>\n</i> in 2002.<br /><br />His fans and friends also knew Adams as an environmental activist and a lover of fast cars, cameras, the Macintosh computer, and other "techno gizmos". <br /><br />Toward the end of his life he was a sought-after lecturer on topics including technology and the environment.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              Douglas Noël Adams was an English author, comic radio dramatist, and musician. He is best known as the author of the <i>\n  <a href="https://www.goodreads.com/book/show/11.Hitchhiker_s_Guide_to_the_Galaxy" title="Hitchhiker's Guide to the Galaxy" rel="nofollow">Hitchhiker's Guide to the Galaxy</a>\n</i> series. Hitchhiker's began on radio, and developed into a "trilogy" of five books (which sold more than fifteen million copies during his lifetime) as well as a television series, a comic book series, a computer game, and a feature film that was completed after Adams' death. The series has also been adapted for live theatre using various scripts; the earliest such productions used material newly written by Adams. He was known to some fans as Bop Ad (after his illegible signature), or by his initials "DNA".<br /><br />In addition to <i>The Hitchhiker's Guide to the Galaxy</i>, Douglas Adams wrote or co-wrote three stories of the science fiction television series Doctor Who and served as Script Editor during the seventeenth season. His other written works include the Dirk Gently novels, and he co-wrote two Liff books and <i>Last Chance to See</i>, itself based on a radio series. Adams also originated the idea for the computer game <i>Starship Titanic</i>, which was produced by a company that Adams co-founded, and adapted into a novel by Terry Jones. A posthumous collection of essays and other material, including an incomplete novel, was published as <i>\n  <a href="https://www.goodreads.com/book/show/359.The_Salmon_of_Doubt" title="The Salmon of Doubt" rel="nofollow">The Salmon of Doubt</a>\n</i> in 2002.<br /><br />His fans and friends also knew Adams as an environmental activist and a lover of fast cars, cameras, the Macintosh computer, and other "techno gizmos". <br /><br />Toward the end of his life he was a sought-after lecturer on topics including technology and the environment.
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              Douglas Noël Adams was an English author, comic radio dramatist, and musician. He is best known as the author of the <i>\n  <a href="https://www.goodreads.com/book/show/11.Hitchhiker_s_Guide_to_the_Galaxy" title="Hitchhiker's Guide to the Galaxy" rel="nofollow">Hitchhiker's Guide to the Galaxy</a>\n</i> series. Hitchhiker's began on radio, and developed into a "trilogy" of five books (which sold more than fifteen million copies during his lifetime) as well as a television series, a comic book series, a computer game, and a feature film that was completed after Adams' death. The series has also been adapted for live theatre using various scripts; the earliest such productions used material newly written by Adams. He was known to some fans as Bop Ad (after his illegible signature), or by his initials "DNA".<br /><br />In addition to <i>The Hitchhiker's Guide to the Galaxy</i>, Douglas Adams wrote or co-wrote three stories of the science fiction television series Doctor Who and served as Script Editor during the seventeenth season. His other written works include the Dirk Gently novels, and he co-wrote two Liff books and <i>Last Chance to See</i>, itself based on a radio series. Adams also originated the idea for the computer game <i>Starship Titanic</i>, which was produced by a company that Adams co-founded, and adapted into a novel by Terry Jones. A posthumous collection of essays and other material, including an incomplete novel, was published as <i>\n  <a href="https://www.goodreads.com/book/show/359.The_Salmon_of_Doubt" title="The Salmon of Doubt" rel="nofollow">The Salmon of Doubt</a>\n</i> in 2002.<br /><br />His fans and friends also knew Adams as an environmental activist and a lover of fast cars, cameras, the Macintosh computer, and other "techno gizmos". <br /><br />Toward the end of his life he was a sought-after lecturer on topics including technology and the environment.
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              Douglas Noël Adams was an English author, comic radio dramatist, and musician. He is best known as the author of the <i>\n  <a href="https://www.goodreads.com/book/show/11.Hitchhiker_s_Guide_to_the_Galaxy" title="Hitchhiker's Guide to the Galaxy" rel="nofollow">Hitchhiker's Guide to the Galaxy</a>\n</i> series. Hitchhiker's began on radio, and developed into a "trilogy" of five books (which sold more than fifteen million copies during his lifetime) as well as a television series, a comic book series, a computer game, and a feature film that was completed after Adams' death. The series has also been adapted for live theatre using various scripts; the earliest such productions used material newly written by Adams. He was known to some fans as Bop Ad (after his illegible signature), or by his initials "DNA".<br /><br />In addition to <i>The Hitchhiker's Guide to the Galaxy</i>, Douglas Adams wrote or co-wrote three stories of the science fiction television series Doctor Who and served as Script Editor during the seventeenth season. His other written works include the Dirk Gently novels, and he co-wrote two Liff books and <i>Last Chance to See</i>, itself based on a radio series. Adams also originated the idea for the computer game <i>Starship Titanic</i>, which was produced by a company that Adams co-founded, and adapted into a novel by Terry Jones. A posthumous collection of essays and other material, including an incomplete novel, was published as <i>\n  <a href="https://www.goodreads.com/book/show/359.The_Salmon_of_Doubt" title="The Salmon of Doubt" rel="nofollow">The Salmon of Doubt</a>\n</i> in 2002.<br /><br />His fans and friends also knew Adams as an environmental activist and a lover of fast cars, cameras, the Macintosh computer, and other "techno gizmos". <br /><br />Toward the end of his life he was a sought-after lecturer on topics including technology and the environment.
##      born    died
## 1 7/31/65        
## 2 7/31/65        
## 3 3/11/52 5/11/01
## 4 3/11/52 5/11/01
## 5 3/11/52 5/11/01
## 6 3/11/52 5/11/01
##                                                                 influence
## 1                    C.S. Lewis,Oscar Wilde,Geoffrey Chaucer,Jane Austen,
## 2                    C.S. Lewis,Oscar Wilde,Geoffrey Chaucer,Jane Austen,
## 3 Jane Austen,Charles Dickens,P. G. Wodehouse,Kurt Vonnegut,Monty Python,
## 4 Jane Austen,Charles Dickens,P. G. Wodehouse,Kurt Vonnegut,Monty Python,
## 5 Jane Austen,Charles Dickens,P. G. Wodehouse,Kurt Vonnegut,Monty Python,
## 6 Jane Austen,Charles Dickens,P. G. Wodehouse,Kurt Vonnegut,Monty Python,
##   avg_author_rating author_ratings_count author_review_count
## 1              4.46             24511114              579250
## 2              4.46             24511114              579250
## 3              4.20              2624222               57565
## 4              4.20              2624222               57565
## 5              4.20              2624222               57565
## 6              4.20              2624222               57565
##                        website    twitter                                genre
## 1     http://www.jkrowling.com jk_rowling          fantasy,fiction,young adult
## 2     http://www.jkrowling.com jk_rowling          fantasy,fiction,young adult
## 3 http://www.douglasadams.com/            comedy,fiction,mystery and thrillers
## 4 http://www.douglasadams.com/            comedy,fiction,mystery and thrillers
## 5 http://www.douglasadams.com/            comedy,fiction,mystery and thrillers
## 6 http://www.douglasadams.com/            comedy,fiction,mystery and thrillers
##                      original_hometown        country latitude longitude
## 1 Yate, South Gloucestershire, England United Kingdom 51.54074  -2.41839
## 2 Yate, South Gloucestershire, England United Kingdom 51.54074  -2.41839
## 3                   Cambridge, England United Kingdom 52.20000   0.11667
## 4                   Cambridge, England United Kingdom 52.20000   0.11667
## 5                   Cambridge, England United Kingdom 52.20000   0.11667
## 6                   Cambridge, England United Kingdom 52.20000   0.11667

6) DATA CLEANING

# mutate to correct column data types
books_1 <- books_sa %>% mutate(num_pages = as.numeric(num_pages),
                                     avg_book_rating = as.numeric(avg_book_rating),
                                    text_reviews_count = as.numeric(text_reviews_count), 
                                    publication_date = as.Date(publication_date, format="%m/%d/%Y"),
                                    born = as.Date(born, format="%m/%d/%Y"),
                                    died = as.Date(died, format="%m/%d/%Y"),
                                    gender = as.factor(gender)
                                    ) 


# remove NAs
books_total <- books_1 %>% 
  filter(
    (!is.na(avg_book_rating)), (!is.na(book_ratings_count)), (!is.na(text_reviews_count)), (!is.na(publication_date)), 
    (!duplicated(title)), 
    (avg_book_rating != 0), 
    (author != "NOT A BOOK"),
    (!is_greater_than(num_pages, 2000)),
    (num_pages != 0),
    (bookID != 9796),
    (!is_less_than(num_pages, 10))
    )


# remove irrelevant variables (11):
# sd(standard deviation of words in title), author ID, image_URL, about, influence, website, twitter, original hometown, country, latitude, longitude

books_corti <- books_total %>% select(-isbn13,
                                      -sd,
                                      -authorid,
                                      -image_url,
                                      -about,
                                      -influence,
                                      -website,
                                      -twitter,
                                      -original_hometown,
                                      -country,
                                      -latitude,
                                      -longitude) %>% rename(
                                                      title_sentiment_avg = ave_sentiment,
                                                      title_word_count = word_count
                                      )
# View(books_corti)

7) DATA EXPLORATION

# NA VISUALIZATION
# to see the number of missing values in each column

# STEPS:
# 1) We need to sum through every column using a FOR loop.
# 2) Then print the variable name using names(movies[i]).
# 3) Finally, we print the sum of is.na() for just that variable.

# FOR loop to see each column in books data set
for(i in 1:ncol(books_corti)){
  
  # print the following
  print(
    
    # first print "Variable: "
    paste0("Variable: ", 
           
           # then print the variable name, then "NAs: "
           names(books_corti)[i], " NAs: ", 
           
           # then print the sum of the number of missing values 
           # for that variable
           sum(is.na(books_corti %>% select(i)))
          )
        
        )
}
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(i)` instead of `i` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## [1] "Variable: bookID NAs: 0"
## [1] "Variable: title NAs: 0"
## [1] "Variable: author NAs: 0"
## [1] "Variable: avg_book_rating NAs: 0"
## [1] "Variable: isbn NAs: 0"
## [1] "Variable: language_code NAs: 0"
## [1] "Variable: num_pages NAs: 0"
## [1] "Variable: book_ratings_count NAs: 0"
## [1] "Variable: text_reviews_count NAs: 0"
## [1] "Variable: publication_date NAs: 0"
## [1] "Variable: publisher NAs: 0"
## [1] "Variable: title_word_count NAs: 0"
## [1] "Variable: title_sentiment_avg NAs: 0"
## [1] "Variable: authorworkcount NAs: 0"
## [1] "Variable: author_fans NAs: 0"
## [1] "Variable: gender NAs: 0"
## [1] "Variable: born NAs: 2652"
## [1] "Variable: died NAs: 4210"
## [1] "Variable: avg_author_rating NAs: 0"
## [1] "Variable: author_ratings_count NAs: 0"
## [1] "Variable: author_review_count NAs: 0"
## [1] "Variable: genre NAs: 0"
# starts_with() function for certain columns...

books_corti %>% select(starts_with("isbn")) %>% glimpse()
## Rows: 5,914
## Columns: 1
## $ isbn <chr> "0439554896", "0517226952", "0345453743", "1400052920", "0517149…
# exploring first 10 rows using slice() function

explore_data <- books_corti %>% arrange(desc(avg_book_rating)) %>% slice(1:10) %>% select(title, author, avg_book_rating)
print(explore_data)
##                                                         title
## 1  Zone of the Enders: The 2nd Runner Official Strategy Guide
## 2     The Diamond Color Meditation: Color Pathway to the Soul
## 3                                   Taxation of Mineral Rents
## 4               The Irish Anatomist: A Study of Flann O'Brien
## 5   His Princess Devotional: A Royal Encounter With Your King
## 6                                     Stargirl LitPlans on CD
## 7                              The Complete Calvin and Hobbes
## 8        Wissenschaft der Logik: Die Lehre Vom Begriff (1816)
## 9               It's a Magical World (Calvin and Hobbes  #11)
## 10         Homicidal Psycho Jungle Cat (Calvin and Hobbes #9)
##                           author avg_book_rating
## 1                     Tim Bogenn            5.00
## 2                  John  Diamond            5.00
## 3                   Ross Garnaut            5.00
## 4                  Keith Donohue            5.00
## 5            Sheri Rose Shepherd            5.00
## 6                Mary B. Collins            4.86
## 7                 Bill Watterson            4.82
## 8  Georg Wilhelm Friedrich Hegel            4.78
## 9                 Bill Watterson            4.76
## 10                Bill Watterson            4.72
datatable(books_corti)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
# ONLY select "NOT A BOOK" under author variable (a.k.a. the column) and store this as a new data frame

not_a_book <- books_corti %>% filter(author == "NOT A BOOK") %>% nrow()
print(not_a_book)
## [1] 0

Expectation:

Linear regression analysis is sensitive to outliers. Use histogram to see where this will occur.

ggplot(books_corti, aes(x = avg_book_rating)) + 
  xlab("Average Book Rating") + 
  ylab("Count") + 
  geom_histogram(fill = "skyblue", color = "#879bcd") + 
  theme_dark(base_size = 18) + 
  ggtitle("               Histogram to View Outliers")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p <- books_corti %>%
  ggplot(aes(avg_book_rating, title_sentiment_avg)) +
  xlab("Average Book Rating") + ylab("Title Sentiment") +
  geom_point(color = "skyblue", alpha = 1/2, size = 0.5) +
  theme_bw(base_size = 18) +
  ggtitle("Exploring the Data: Visualization 1")

ggplotly(p)
p <- ggplot(books_corti %>%
         mutate(genderMutated = fct_lump(gender, n = 10)),
       aes(x = avg_book_rating, y = genderMutated, fill = genderMutated)) +
  theme_minimal(base_size = 18) +
  geom_density_ridges(color="black") +
  xlab("Average Book Rating") +
  ylab("Gender of Author") +  
  ggtitle("   Exploring the Data: Visualization 2")

p + theme(legend.position = "none")
## Picking joint bandwidth of 0.0532

8) EXAMINING DATA STRUCTURE

str(books_corti)
## 'data.frame':    5914 obs. of  22 variables:
##  $ bookID              : chr  "4" "12" "13" "14" ...
##  $ title               : chr  "Harry Potter and the Chamber of Secrets (Harry Potter  #2)" "The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy  #1-5)" "The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1-5)" "The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1)" ...
##  $ author              : chr  "J.K. Rowling" "Douglas Adams" "Douglas Adams" "Douglas Adams" ...
##  $ avg_book_rating     : num  4.42 4.38 4.38 4.22 4.38 4.21 3.44 3.87 4.07 3.9 ...
##  $ isbn                : chr  "0439554896" "0517226952" "0345453743" "1400052920" ...
##  $ language_code       : chr  "eng" "eng" "eng" "eng" ...
##  $ num_pages           : num  352 815 815 215 815 544 55 256 335 304 ...
##  $ book_ratings_count  : int  6333 3628 249558 4930 2877 248558 7270 2088 72451 49240 ...
##  $ text_reviews_count  : num  244 254 4080 460 195 ...
##  $ publication_date    : Date, format: "2003-11-01" "2005-11-01" ...
##  $ publisher           : chr  "Scholastic" "Gramercy Books" "Del Rey Books" "Crown" ...
##  $ title_word_count    : int  18 15 12 44 9 12 8 12 4 14 ...
##  $ title_sentiment_avg : num  0 0.31 0.346 0.362 0.4 ...
##  $ authorworkcount     : int  242 103 103 103 103 59 59 59 59 59 ...
##  $ author_fans         : int  209174 19029 19029 19029 19029 14356 14356 14356 14356 14356 ...
##  $ gender              : Factor w/ 3 levels "female","male",..: 1 2 2 2 2 2 2 2 2 2 ...
##  $ born                : Date, format: "0065-07-31" "0052-03-11" ...
##  $ died                : Date, format: NA "0001-05-11" ...
##  $ avg_author_rating   : num  4.46 4.2 4.2 4.2 4.2 4.03 4.03 4.03 4.03 4.03 ...
##  $ author_ratings_count: int  24511114 2624222 2624222 2624222 2624222 1272002 1272002 1272002 1272002 1272002 ...
##  $ author_review_count : int  579250 57565 57565 57565 57565 76846 76846 76846 76846 76846 ...
##  $ genre               : chr  "fantasy,fiction,young adult" "comedy,fiction,mystery and thrillers" "comedy,fiction,mystery and thrillers" "comedy,fiction,mystery and thrillers" ...

9) SUMMARY STATS

options(qwraps2_markup = "markdown")
view(books_corti)
our_summary1 <-
    list("Average Book Rating" =
       list("min"       = ~ min(avg_book_rating),
            "mean"      = ~ mean(avg_book_rating),
            "max"       = ~ max(avg_book_rating),
            "st. dev"   = ~ sd(avg_book_rating)),
       "Number of Pages" =
       list("min"       = ~ min(num_pages),
            "mean"    = ~ mean(num_pages),
            "max"       = ~ max(num_pages),
            "st.dev" = ~ sd(num_pages)),
       "Book Ratings Count" =
       list("min"       = ~ min(book_ratings_count),
            "mean"      = ~ mean(book_ratings_count),
            "max"       = ~ max(book_ratings_count),
            "st. dev"   = ~ sd(book_ratings_count)),
        "Text Reviews Count" =
      list("min"       = ~ min(text_reviews_count),
            "mean"      = ~ mean(text_reviews_count),
            "max"       = ~ max(text_reviews_count),
            "st. dev"   = ~ sd(text_reviews_count)),
      "Average Title Sentiment Score" =
      list("min"       = ~ min(title_sentiment_avg),
            "mean"      = ~ mean(title_sentiment_avg),
            "max"       = ~ max(title_sentiment_avg),
            "st. dev"   = ~ sd(title_sentiment_avg)),
      "Author's Work Count" =
      list("min"       = ~ min(authorworkcount),
            "mean"      = ~ mean(authorworkcount),
            "max"       = ~ max(authorworkcount),
            "st. dev"   = ~ sd(authorworkcount)),
      "Author's Fan Count" =
      list("min"       = ~ min(author_fans),
            "mean"      = ~ mean(author_fans),
            "max"       = ~ max(author_fans),
            "st. dev"   = ~ sd(author_fans)),
      "Author Ratings Count" =
      list("min"       = ~ min(author_ratings_count),
            "mean"      = ~ mean(author_ratings_count),
            "max"       = ~ max(author_ratings_count),
            "st. dev"   = ~ sd(author_ratings_count)),
      "Author Review Count" =
      list("min"       = ~ min(author_review_count),
            "mean"      = ~ mean(author_review_count),
            "max"       = ~ max(author_review_count),
            "st. dev"   = ~ sd(author_review_count))
      )
sum_stats <- summary_table(books_corti, our_summary1) %>% round(1)

print(sum_stats)
## 
## 
## |                                  |books_corti (N = 5,914) |
## |:---------------------------------|:-----------------------|
## |**Average Book Rating**           |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |1                       |
## |&nbsp;&nbsp; mean                 |3.9                     |
## |&nbsp;&nbsp; max                  |5                       |
## |&nbsp;&nbsp; st. dev              |0.3                     |
## |**Number of Pages**               |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |10                      |
## |&nbsp;&nbsp; mean                 |350.4                   |
## |&nbsp;&nbsp; max                  |1952                    |
## |&nbsp;&nbsp; st.dev               |195.2                   |
## |**Book Ratings Count**            |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |0                       |
## |&nbsp;&nbsp; mean                 |21480.8                 |
## |&nbsp;&nbsp; max                  |4597666                 |
## |&nbsp;&nbsp; st. dev              |120423.5                |
## |**Text Reviews Count**            |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |0                       |
## |&nbsp;&nbsp; mean                 |696.7                   |
## |&nbsp;&nbsp; max                  |94265                   |
## |&nbsp;&nbsp; st. dev              |2844.8                  |
## |**Average Title Sentiment Score** |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |-1.4                    |
## |&nbsp;&nbsp; mean                 |0                       |
## |&nbsp;&nbsp; max                  |1.3                     |
## |&nbsp;&nbsp; st. dev              |0.3                     |
## |**Author's Work Count**           |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |1                       |
## |&nbsp;&nbsp; mean                 |231                     |
## |&nbsp;&nbsp; max                  |5204                    |
## |&nbsp;&nbsp; st. dev              |488.1                   |
## |**Author's Fan Count**            |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |0                       |
## |&nbsp;&nbsp; mean                 |12050                   |
## |&nbsp;&nbsp; max                  |709826                  |
## |&nbsp;&nbsp; st. dev              |55558.6                 |
## |**Author Ratings Count**          |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |27                      |
## |&nbsp;&nbsp; mean                 |658708                  |
## |&nbsp;&nbsp; max                  |24511114                |
## |&nbsp;&nbsp; st. dev              |1727055.4               |
## |**Author Review Count**           |&nbsp;&nbsp;            |
## |&nbsp;&nbsp; min                  |1                       |
## |&nbsp;&nbsp; mean                 |26511.1                 |
## |&nbsp;&nbsp; max                  |579250                  |
## |&nbsp;&nbsp; st. dev              |58813.9                 |

10) LINEAR MODEL VALIDATION: TRAIN-TEST-SPLIT

Need to load ‘rsample’ library here.

set.seed(1818)

train_prop <- 0.8

books_split <- initial_split(books_corti, prop = train_prop)

books_train <- training(books_split)
books_test <- testing(books_split)
nrow(books_train)
## [1] 4732
nrow(books_test)
## [1] 1182
head(books_train)
##   bookID
## 1      4
## 2     12
## 3     13
## 4     14
## 5     18
## 6     21
##                                                                                                          title
## 1                                                   Harry Potter and the Chamber of Secrets (Harry Potter  #2)
## 2 The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy  #1-5)
## 3                       The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1-5)
## 4                                  The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1)
## 5                                     The Ultimate Hitchhiker's Guide (Hitchhiker's Guide to the Galaxy  #1-5)
## 6                                                                         A Short History of Nearly Everything
##          author avg_book_rating       isbn language_code num_pages
## 1  J.K. Rowling            4.42 0439554896           eng       352
## 2 Douglas Adams            4.38 0517226952           eng       815
## 3 Douglas Adams            4.38 0345453743           eng       815
## 4 Douglas Adams            4.22 1400052920           eng       215
## 5 Douglas Adams            4.38 0517149257           eng       815
## 6   Bill Bryson            4.21 076790818X           eng       544
##   book_ratings_count text_reviews_count publication_date      publisher
## 1               6333                244       2003-11-01     Scholastic
## 2               3628                254       2005-11-01 Gramercy Books
## 3             249558               4080       2002-04-30  Del Rey Books
## 4               4930                460       2004-08-03          Crown
## 5               2877                195       1996-01-17    Wings Books
## 6             248558               9396       2004-09-14 Broadway Books
##   title_word_count title_sentiment_avg authorworkcount author_fans gender
## 1               18           0.0000000             242      209174 female
## 2               15           0.3098387             103       19029   male
## 3               12           0.3464102             103       19029   male
## 4               44           0.3618136             103       19029   male
## 5                9           0.4000000             103       19029   male
## 6               12           0.0000000              59       14356   male
##         born       died avg_author_rating author_ratings_count
## 1 0065-07-31       <NA>              4.46             24511114
## 2 0052-03-11 0001-05-11              4.20              2624222
## 3 0052-03-11 0001-05-11              4.20              2624222
## 4 0052-03-11 0001-05-11              4.20              2624222
## 5 0052-03-11 0001-05-11              4.20              2624222
## 6 0051-12-08       <NA>              4.03              1272002
##   author_review_count                                genre
## 1              579250          fantasy,fiction,young adult
## 2               57565 comedy,fiction,mystery and thrillers
## 3               57565 comedy,fiction,mystery and thrillers
## 4               57565 comedy,fiction,mystery and thrillers
## 5               57565 comedy,fiction,mystery and thrillers
## 6               76846                   non fiction,travel

11) MODEL 1: LINEAR REGRESSION

Need ‘dplyr’, ‘glmnet’, and ‘glmnetUtils’ libraries here.

options(scipen = 999)
mod1 <- lm(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender, data = books_train)

summary(mod1)
## 
## Call:
## lm(formula = avg_book_rating ~ num_pages + book_ratings_count + 
##     text_reviews_count + title_sentiment_avg + authorworkcount + 
##     author_fans + author_ratings_count + author_review_count + 
##     gender, data = books_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8325 -0.1575  0.0139  0.1787  1.1391 
## 
## Coefficients:
##                            Estimate     Std. Error t value             Pr(>|t|)
## (Intercept)           3.79675179461  0.01058977825 358.530 < 0.0000000000000002
## num_pages             0.00027651722  0.00002059907  13.424 < 0.0000000000000002
## book_ratings_count   -0.00000005966  0.00000007126  -0.837             0.402464
## text_reviews_count    0.00000817665  0.00000320658   2.550             0.010805
## title_sentiment_avg   0.04097254653  0.01483542542   2.762             0.005771
## authorworkcount       0.00003276625  0.00000871417   3.760             0.000172
## author_fans           0.00000013706  0.00000013538   1.012             0.311392
## author_ratings_count  0.00000006167  0.00000000699   8.823 < 0.0000000000000002
## author_review_count  -0.00000172266  0.00000025130  -6.855     0.00000000000804
## gendermale            0.03361376024  0.00913831936   3.678             0.000237
## genderunknown         0.01812663016  0.01336993658   1.356             0.175236
##                         
## (Intercept)          ***
## num_pages            ***
## book_ratings_count      
## text_reviews_count   *  
## title_sentiment_avg  ** 
## authorworkcount      ***
## author_fans             
## author_ratings_count ***
## author_review_count  ***
## gendermale           ***
## genderunknown           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.275 on 4721 degrees of freedom
## Multiple R-squared:  0.06875,    Adjusted R-squared:  0.06678 
## F-statistic: 34.85 on 10 and 4721 DF,  p-value: < 0.00000000000000022

#——————————————————– # estimating “prettier” regression output #——————————————————–

Need ‘sjPlot’ and ‘tidymodels’ libraries.

#——————————————————– tab_model() outputs a table of results #——————————————————–

tab_model(mod1, digits = 3)
  avg book rating
Predictors Estimates CI p
(Intercept) 3.797 3.776 – 3.818 <0.001
num_pages 0.000 0.000 – 0.000 <0.001
book_ratings_count -0.000 -0.000 – 0.000 0.402
text_reviews_count 0.000 0.000 – 0.000 0.011
title_sentiment_avg 0.041 0.012 – 0.070 0.006
authorworkcount 0.000 0.000 – 0.000 <0.001
author_fans 0.000 -0.000 – 0.000 0.311
author_ratings_count 0.000 0.000 – 0.000 <0.001
author_review_count -0.000 -0.000 – -0.000 <0.001
gender [male] 0.034 0.016 – 0.052 <0.001
gender [unknown] 0.018 -0.008 – 0.044 0.175
Observations 4732
R2 / R2 adjusted 0.069 / 0.067

#——————————————————– plot_model() outputs a plot of regression coefficients #——————————————————–

plot_model(mod1)+ ylim(-0.1,0.1)  + ggtitle("            Average Book Rating Coefficients") + theme_minimal(base_size = 16)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

#——————————————————– tidy() outputs a table of coefficients and their p-values, t-stats #——————————————————–

tidy(mod1)
## # A tibble: 11 x 5
##    term                      estimate     std.error statistic  p.value
##    <chr>                        <dbl>         <dbl>     <dbl>    <dbl>
##  1 (Intercept)           3.80         0.0106          359.    0.      
##  2 num_pages             0.000277     0.0000206        13.4   2.39e-40
##  3 book_ratings_count   -0.0000000597 0.0000000713     -0.837 4.02e- 1
##  4 text_reviews_count    0.00000818   0.00000321        2.55  1.08e- 2
##  5 title_sentiment_avg   0.0410       0.0148            2.76  5.77e- 3
##  6 authorworkcount       0.0000328    0.00000871        3.76  1.72e- 4
##  7 author_fans           0.000000137  0.000000135       1.01  3.11e- 1
##  8 author_ratings_count  0.0000000617 0.00000000699     8.82  1.55e-18
##  9 author_review_count  -0.00000172   0.000000251      -6.86  8.04e-12
## 10 gendermale            0.0336       0.00914           3.68  2.37e- 4
## 11 genderunknown         0.0181       0.0134            1.36  1.75e- 1

12) MODEL 2: ELASTIC NET

Note: We used an alpha sequence from 0 to 1 in steps of 0.1.

enet_mod <- cva.glmnet(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender,
                       data = books_train,
                       alpha = seq(0,1, by = 0.1))
print(enet_mod)
## Call:
## cva.glmnet.formula(formula = avg_book_rating ~ num_pages + book_ratings_count + 
##     text_reviews_count + title_sentiment_avg + authorworkcount + 
##     author_fans + author_ratings_count + author_review_count + 
##     gender, data = books_train, alpha = seq(0, 1, by = 0.1))
## 
## Model fitting options:
##     Sparse model matrix: FALSE
##     Use model.frame: FALSE
##     Alpha values: 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
##     Number of crossvalidation folds for lambda: 10
plot(enet_mod)

minlossplot(enet_mod, 
            cv.type = "min")

13) EXTRACT BEST LINEAR MODEL

# Use this function to find the best alpha.
get_alpha <- function(fit) {
  alpha <- fit$alpha
  error <- sapply(fit$modlist, 
                  function(mod) {min(mod$cvm)})
  alpha[which.min(error)]
}

# Get all parameters.
get_model_params <- function(fit) {
  alpha <- fit$alpha
  lambdaMin <- sapply(fit$modlist, `[[`, "lambda.min")
  lambdaSE <- sapply(fit$modlist, `[[`, "lambda.1se")
  error <- sapply(fit$modlist, function(mod) {min(mod$cvm)})
  best <- which.min(error)
  data.frame(alpha = alpha[best], lambdaMin = lambdaMin[best],
             lambdaSE = lambdaSE[best], error = error[best])
}

# Extract the best alpha value & model parameters.
best_alpha <- get_alpha(enet_mod)
print(best_alpha)
## [1] 1
get_model_params(enet_mod)
##   alpha    lambdaMin   lambdaSE      error
## 1     1 0.0001936006 0.01273763 0.07581943
# Extract the best model object.
best_mod <- enet_mod$modlist[[which(enet_mod$alpha == best_alpha)]]

14) MODEL 3: BEST ELASTIC NET MODEL

enet_best_mod <- cv.glmnet(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender,
                       data = books_train,
                       alpha = 0.1)
summary(enet_best_mod)
##                 Length Class  Mode     
## lambda          92     -none- numeric  
## cvm             92     -none- numeric  
## cvsd            92     -none- numeric  
## cvup            92     -none- numeric  
## cvlo            92     -none- numeric  
## nzero           92     -none- numeric  
## call             4     -none- call     
## name             1     -none- character
## glmnet.fit      12     elnet  list     
## lambda.min       1     -none- numeric  
## lambda.1se       1     -none- numeric  
## terms            2     -none- call     
## xlev             9     -none- list     
## alpha            1     -none- numeric  
## nfolds           1     -none- numeric  
## sparse           1     -none- logical  
## use.model.frame  1     -none- logical  
## na.action        1     -none- character
print(enet_best_mod)
## Call:
## cv.glmnet.formula(formula = avg_book_rating ~ num_pages + book_ratings_count + 
##     text_reviews_count + title_sentiment_avg + authorworkcount + 
##     author_fans + author_ratings_count + author_review_count + 
##     gender, data = books_train, alpha = 0.1)
## 
## Model fitting options:
##     Sparse model matrix: FALSE
##     Use model.frame: FALSE
##     Number of crossvalidation folds: 10
##     Alpha: 0.1
##     Deviance-minimizing lambda: 0.0001430848  (+1 SE): 0.3229453

Print the model’s two suggested values of lambda.

print(enet_best_mod$lambda.min)
## [1] 0.0001430848
print(enet_best_mod$lambda.1se)
## [1] 0.3229453

Plot how the MSE varies as we vary lambda.

plot(enet_best_mod)

coefpath(enet_best_mod)

Compare lambda min & lambda 1SE…

# put into coefficient vector
enet_coefs <- data.frame(
  `lasso_min` = coef(enet_best_mod, s = enet_best_mod$lambda.min) %>%
    as.matrix() %>% data.frame() %>% round(3),
  `lasso_1se` = coef(enet_best_mod, s = enet_best_mod$lambda.1se) %>% 
    as.matrix() %>% data.frame() %>% round(3)
) %>%  rename(`lasso_min` = 1, `lasso_1se` = 2)

print(enet_coefs)
##                      lasso_min lasso_1se
## (Intercept)              3.815     3.903
## num_pages                0.000     0.000
## book_ratings_count       0.000     0.000
## text_reviews_count       0.000     0.000
## title_sentiment_avg      0.041     0.000
## authorworkcount          0.000     0.000
## author_fans              0.000     0.000
## author_ratings_count     0.000     0.000
## author_review_count      0.000     0.000
## genderfemale            -0.018     0.000
## gendermale               0.015     0.000
## genderunknown            0.000     0.000
enet_coefs %>% kable() %>% kable_styling()
lasso_min lasso_1se
(Intercept) 3.815 3.903
num_pages 0.000 0.000
book_ratings_count 0.000 0.000
text_reviews_count 0.000 0.000
title_sentiment_avg 0.041 0.000
authorworkcount 0.000 0.000
author_fans 0.000 0.000
author_ratings_count 0.000 0.000
author_review_count 0.000 0.000
genderfemale -0.018 0.000
gendermale 0.015 0.000
genderunknown 0.000 0.000

15) MODEL 3: BOOTSTRAP AGGREGATING (BAGGING)

Need ‘partykit’, ‘PerformanceAnalytics’, ‘rpart’, ‘rpart.plot’, and ‘randomForest’ libraries.

options(scipen = 10)
#set.seed(1818)
# store row names as columns
books_boot_preds <- books_corti %>% rownames_to_column() %>% 
  mutate(rowname = as.numeric(rowname))

B <- 100      # number of bootstrap samples
num_b <- 500  # sample size of each bootstrap
boot_mods <- list() # store our bagging models
for(i in 1:B){
  boot_idx <- sample(1:nrow(books_corti), 
                     size = num_b,
                     replace = FALSE)
  # fit a tree on each bootstrap sample
  boot_tree <- ctree(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count+ gender, 
                     data = books_corti %>% 
                       slice(boot_idx)) 
  # store bootstraped model
  boot_mods[[i]] <- boot_tree
  # generate predictions for that bootstrap model
  preds_boot <- data.frame(
    preds_boot = predict(boot_tree),
    rowname = boot_idx 
  )  
  # rename prediction to indicate which boot iteration it came from
  names(preds_boot)[1] <- paste("preds_boot",i,sep = "")
  # merge predictions to dataset
  books_boot_preds <- left_join(x = books_boot_preds, y = preds_boot,
                                  by = "rowname")
}

#——————————————————– plot() examines an individual model from bagging #——————————————————–

plot(boot_mods[[1]], gp = gpar(fontsize = 8))

books_boot_preds <- books_boot_preds %>% 
  mutate(preds_bag = 
           select(., preds_boot1:preds_boot100) %>% 
           rowMeans(na.rm = TRUE))

# NOTE: At this point in the code, the model has been bootstrapped.

16) MODEL 4: RANDOM FOREST

rf_fit <- randomForest(avg_book_rating ~ num_pages + book_ratings_count + text_reviews_count + title_sentiment_avg + authorworkcount + author_fans + author_ratings_count + author_review_count + gender,
                       data = books_corti,
                       type = regression,
                       mtry = 11/3,
                       ntree = 200,
                       importance = TRUE)

print(rf_fit)
## 
## Call:
##  randomForest(formula = avg_book_rating ~ num_pages + book_ratings_count +      text_reviews_count + title_sentiment_avg + authorworkcount +      author_fans + author_ratings_count + author_review_count +      gender, data = books_corti, type = regression, mtry = 11/3,      ntree = 200, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 200
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.06013366
##                     % Var explained: 24.51
plot(rf_fit)

varImpPlot(rf_fit, type = 1)

plot_min_depth_distribution(rf_fit)

plot_predict_interaction(rf_fit, books_corti, "author_ratings_count", "num_pages")

plot_predict_interaction(rf_fit, books_corti, "authorworkcount", "num_pages")

plot_predict_interaction(rf_fit, books_corti, "num_pages", "title_sentiment_avg")

Storing predictions data frames for Linear and ElasticNet models…

lm_preds_train <- predict(mod1, newdata = books_train)
lm_preds_test <- predict(mod1, 
                         newdata = books_test)
enet_preds_train <- predict(enet_best_mod, 
                             newdata = books_train,  s = "lambda.min")
enet_preds_test <- predict(enet_best_mod,
                      newdata = books_test,  s = "lambda.min")
head(lm_preds_train)
##        1        2        3        4        5        6 
## 4.446006 4.138931 4.157041 3.976757 4.142188 3.992752
head(lm_preds_test)
##       10       20       21       38       39       43 
## 3.874056 3.856204 3.880741 3.859920 3.838027 3.983870
head(enet_preds_train)
##          1
## 1 4.439726
## 2 4.137918
## 3 4.156343
## 4 3.975785
## 5 4.141186
## 6 3.993360
head(enet_preds_test)
##           1
## 10 3.875319
## 20 3.856850
## 21 3.880675
## 38 3.859698
## 39 3.837804
## 43 3.983638

Storing results data frames for Linear and ElasticNet models…

training_predictions <- data.frame(lm_preds_train, enet_preds_train)

results_train <- data.frame(books_train, training_predictions) %>% rename(enet_training = X1)

head(results_train)
##   bookID
## 1      4
## 2     12
## 3     13
## 4     14
## 5     18
## 6     21
##                                                                                                          title
## 1                                                   Harry Potter and the Chamber of Secrets (Harry Potter  #2)
## 2 The Ultimate Hitchhiker's Guide: Five Complete Novels and One Story (Hitchhiker's Guide to the Galaxy  #1-5)
## 3                       The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1-5)
## 4                                  The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1)
## 5                                     The Ultimate Hitchhiker's Guide (Hitchhiker's Guide to the Galaxy  #1-5)
## 6                                                                         A Short History of Nearly Everything
##          author avg_book_rating       isbn language_code num_pages
## 1  J.K. Rowling            4.42 0439554896           eng       352
## 2 Douglas Adams            4.38 0517226952           eng       815
## 3 Douglas Adams            4.38 0345453743           eng       815
## 4 Douglas Adams            4.22 1400052920           eng       215
## 5 Douglas Adams            4.38 0517149257           eng       815
## 6   Bill Bryson            4.21 076790818X           eng       544
##   book_ratings_count text_reviews_count publication_date      publisher
## 1               6333                244       2003-11-01     Scholastic
## 2               3628                254       2005-11-01 Gramercy Books
## 3             249558               4080       2002-04-30  Del Rey Books
## 4               4930                460       2004-08-03          Crown
## 5               2877                195       1996-01-17    Wings Books
## 6             248558               9396       2004-09-14 Broadway Books
##   title_word_count title_sentiment_avg authorworkcount author_fans gender
## 1               18           0.0000000             242      209174 female
## 2               15           0.3098387             103       19029   male
## 3               12           0.3464102             103       19029   male
## 4               44           0.3618136             103       19029   male
## 5                9           0.4000000             103       19029   male
## 6               12           0.0000000              59       14356   male
##         born       died avg_author_rating author_ratings_count
## 1 0065-07-31       <NA>              4.46             24511114
## 2 0052-03-11 0001-05-11              4.20              2624222
## 3 0052-03-11 0001-05-11              4.20              2624222
## 4 0052-03-11 0001-05-11              4.20              2624222
## 5 0052-03-11 0001-05-11              4.20              2624222
## 6 0051-12-08       <NA>              4.03              1272002
##   author_review_count                                genre lm_preds_train
## 1              579250          fantasy,fiction,young adult       4.446006
## 2               57565 comedy,fiction,mystery and thrillers       4.138931
## 3               57565 comedy,fiction,mystery and thrillers       4.157041
## 4               57565 comedy,fiction,mystery and thrillers       3.976757
## 5               57565 comedy,fiction,mystery and thrillers       4.142188
## 6               76846                   non fiction,travel       3.992752
##   enet_training
## 1      4.439726
## 2      4.137918
## 3      4.156343
## 4      3.975785
## 5      4.141186
## 6      3.993360
testing_predictions <- data.frame(
                       "lm_testing" = lm_preds_test,
                        "enet_testing" = enet_preds_test)

results_test <- data.frame(books_test, testing_predictions) %>% rename(enet_testing = X1)

head(results_test)
##    bookID
## 10     25
## 20     53
## 21     55
## 38     86
## 39     89
## 43    105
##                                                                                              title
## 10               I'm a Stranger Here Myself: Notes on Returning to America After Twenty Years Away
## 20                                       Guts: The True Stories behind Hatchet and the Brian Books
## 21                                                  Hatchet Jobs: Writings on Contemporary Fiction
## 38                             The Heidi Chronicles: Uncommon Women and Others & Isn't It Romantic
## 39 Active Literacy Across the Curriculum: Strategies for Reading  Writing  Speaking  and Listening
## 43                                                         Chapterhouse: Dune (Dune Chronicles #6)
##                author avg_book_rating       isbn language_code num_pages
## 10        Bill Bryson            3.90 076790382X           eng       304
## 20       Gary Paulsen            3.88 0385326505           eng       144
## 21          Dale Peck            3.45 1595580271         en-US       228
## 38  Wendy Wasserstein            3.84 0679734996           eng       249
## 39 Heidi Hayes Jacobs            3.94 1596670231           eng       138
## 43      Frank Herbert            3.91 0441102670           eng       436
##    book_ratings_count text_reviews_count publication_date       publisher
## 10              49240               2211       2000-06-28  Broadway Books
## 20               2067                334       2001-01-23 Delacorte Press
## 21                 99                 16       2005-11-01   The New Press
## 38               2766                 64       1991-07-02         Vintage
## 39                 31                  1       2006-03-29       Routledge
## 43              38778                568       1987-07-01       Ace Books
##    title_word_count title_sentiment_avg authorworkcount author_fans  gender
## 10               14         -0.13363062              59       14356    male
## 20               10          0.07905694             224        2321    male
## 21                6          0.06123724              26          91 unknown
## 38               10         -0.15811388              35          55  female
## 39               12          0.02886751              61           8  female
## 43                4          0.00000000             308        7613    male
##          born       died avg_author_rating author_ratings_count
## 10 0051-12-08       <NA>              4.03              1272002
## 20 0039-05-17       <NA>              3.79               461181
## 21 0067-06-13       <NA>              3.67                 6261
## 38 0050-10-18 0006-01-30              3.63                 8656
## 39 0048-10-08       <NA>              3.76                  801
## 43 0020-10-08 0086-02-11              4.10              1190679
##    author_review_count                              genre lm_testing
## 10               76846                 non fiction,travel   3.874056
## 20               32464 fiction,literature,nature,outdoors   3.856204
## 21                 620                    sex,young adult   3.880741
## 38                 727                 fiction,literature   3.859920
## 39                  71                        non fiction   3.838027
## 43               31318                    fantasy,fiction   3.983870
##    enet_testing
## 10     3.875319
## 20     3.856850
## 21     3.880675
## 38     3.859698
## 39     3.837804
## 43     3.983638

17) GGPLOT OF LINEAR REGRESSION: TRAINING RESULTS

ggplot(results_train, aes(x = avg_book_rating, y = lm_preds_train)) + 
   geom_point(alpha = 1/10, size = 4) +
  theme_minimal(base_size = 16)+
  geom_abline(color = "turquoise")+
  xlab("True Average Ratings")+
  ylab("Predicted Average Ratings")+
  xlim(0, 5) + ylim(0, 5)+
    ggtitle("              Linear Regression: Training True vs Predicted")

18) GGPLOT OF ELASTIC NET: TRAINING RESULTS

ggplot(results_train, aes(x = avg_book_rating, y = enet_preds_train)) + 
   geom_point(alpha = 1/10, size = 4) +
  theme_minimal(base_size = 16)+
  geom_abline(color = "turquoise")+
  xlab("True Average Ratings")+
  ylab("Predicted Average Ratings")+
  xlim(0, 5) + ylim(0, 5)+
    ggtitle("        Best ElasticNet: Training True vs Predicted")

19) GGPLOT OF LINEAR REGRESSION: TESTING RESULTS

ggplot(results_test, aes(x = avg_book_rating, y = lm_preds_test)) + 
  geom_point(alpha = 1/10, size = 4) +
  geom_abline(color = "coral")+
  theme_minimal(base_size = 16)+
  xlab("True Average Ratings")+
  ylab("Predicted Average Ratings")+
  xlim(0, 5) + ylim(0, 5)+
    ggtitle("              Linear Regression: Testing True vs Predicted")

20) GGPLOT OF ELASTIC NET: TESTING RESULTS

ggplot(results_test, aes(x = avg_book_rating, y = enet_preds_test)) + 
  geom_point(alpha = 1/10, size = 4) +
  geom_abline(color = "coral")+
  theme_minimal(base_size = 16)+
  xlab("True Average Ratings")+
  ylab("Predicted Average Ratings")+
  xlim(0, 5) + ylim(0, 5)+
    ggtitle("         Best ElasticNet: Testing True vs Predicted")

#——————————————————– # 21) MODEL EVALUATION #——————————————————–

LINEAR REGRESSION TRAINING METRICS

rmse(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.275
mae(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard       0.209
rsq(books_train, truth = avg_book_rating, estimate = lm_preds_train)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0687

LINEAR REGRESSION TEST METRICS

lm_rmse <- rmse(books_test, truth = avg_book_rating, estimate = lm_preds_test)
lm_mae <- mae(books_test, truth = avg_book_rating, estimate = lm_preds_test)
lm_rsq <- rsq(books_test, truth = avg_book_rating, estimate = lm_preds_test)

ELASTIC NET TRAINING METRICS

rmse(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.275
mae(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard       0.209
rsq(books_train, truth = avg_book_rating, estimate = as.vector(enet_preds_train))
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard      0.0687

ELASTIC NET TEST METRICS

enet_rmse <- rmse(books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))
enet_mae <- mae(books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))
enet_rsq <- rsq(books_test, truth = avg_book_rating, estimate = as.vector(enet_preds_test))

Tree OUT-OF-BAG Predictions…

books_right_join <- right_join(books_corti, books_boot_preds)
## Joining, by = c("bookID", "title", "author", "avg_book_rating", "isbn", "language_code", "num_pages", "book_ratings_count", "text_reviews_count", "publication_date", "publisher", "title_word_count", "title_sentiment_avg", "authorworkcount", "author_fans", "gender", "born", "died", "avg_author_rating", "author_ratings_count", "author_review_count", "genre")
books_right_join <- books_right_join %>% ungroup()

tree_rmse <- rmse(books_right_join, truth = avg_book_rating, estimate = preds_bag)
tree_mae <- mae(books_right_join, truth = avg_book_rating, estimate = preds_bag)
tree_rsq <- rsq(books_right_join, truth = avg_book_rating, estimate = preds_bag)

Random Forest OUT-OF-BAG Predictions…

preds_OOB <- predict(rf_fit)

rf_rsq <- rsq(books_corti, truth = avg_book_rating, estimate = preds_OOB)
rf_rmse <- rmse(books_corti, truth = avg_book_rating, estimate = preds_OOB)
rf_mae <- mae(books_corti, truth = avg_book_rating, estimate = preds_OOB)

22) MERGING – RSQ, RMSE & MAE COMBINED DATA FRAME

All testing data predictions…

rsq_DF <- merge(rf_rsq, enet_rsq, by=c(".metric", ".estimator")) 
                           
rsq_DF1 <- merge(rsq_DF, lm_rsq, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate) 

rsq_DF2 <- merge(rsq_DF1, tree_rsq, by=c(".metric", ".estimator")) %>% select(-.estimator)

print(rsq_DF2)
##   .metric Random Forest ElasticNet     Linear .estimate
## 1     rsq       0.24535 0.06579101 0.06588606 0.1084553
rmse_DF <- merge(rf_rmse, enet_rmse, by=c(".metric", ".estimator"))

rmse_DF1 <- merge(rmse_DF, lm_rmse, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate) 

rmse_DF2 <- merge(rmse_DF1, tree_rmse, by=c(".metric", ".estimator")) %>% select(-.estimator)

print(rmse_DF2)
##   .metric Random Forest ElasticNet    Linear .estimate
## 1    rmse     0.2452216  0.2633069 0.2633021 0.2692242
mae_DF <- merge(rf_mae, enet_mae, by=c(".metric", ".estimator"))

mae_DF1 <- merge(mae_DF, lm_mae, by=c(".metric", ".estimator")) %>% rename("Random Forest" = .estimate.x, "ElasticNet" = .estimate.y, "Linear" = .estimate) 

mae_DF2 <- merge(mae_DF1, tree_mae, by=c(".metric", ".estimator")) %>% select(-.estimator)

print(mae_DF2)
##   .metric Random Forest ElasticNet    Linear .estimate
## 1     mae     0.1795154  0.2016244 0.2016341 0.2051364
total <- rbind(rsq_DF2, rmse_DF2)

final <-rbind(total, mae_DF2) %>% rename("Tree" = .estimate, "Metrics" = .metric) 

print(final)
##   Metrics Random Forest ElasticNet     Linear      Tree
## 1     rsq     0.2453500 0.06579101 0.06588606 0.1084553
## 2    rmse     0.2452216 0.26330694 0.26330206 0.2692242
## 3     mae     0.1795154 0.20162440 0.20163410 0.2051364

23) METRICS DATA TABLE

Credit for the code below: https://rfortherestofus.com/2019/11/how-to-make-beautiful-tables-in-r/

Need to load ‘kableExtra’ library.

final %>% kable() %>% kable_styling()
Metrics Random Forest ElasticNet Linear Tree
rsq 0.2453500 0.0657910 0.0658861 0.1084553
rmse 0.2452216 0.2633069 0.2633021 0.2692242
mae 0.1795154 0.2016244 0.2016341 0.2051364

Credit for code below: https://www.littlemissdata.com/blog/prettytables

Need to load ‘formattable’, ‘tidyr’, and ‘data.table’ libraries.

custom_one = "#CCCCFF"
custom_two = "skyblue"
custom_three = "#4ec5a5"
custom_coral = "#FA7268"
# custom_green = "#00AD43"

formattable(final, 
            align =c("l","c","c","c","c", "c", "c", "c", "r"), 
            list(`Metrics` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold")), 
                `Random Forest`= color_tile(custom_one, custom_one), 
              `ElasticNet`= color_tile(custom_two, custom_two), 
              `Linear`= color_tile(custom_three, custom_three), 
              `Tree`= color_tile(custom_coral, custom_coral)
))
Metrics Random Forest ElasticNet Linear Tree
rsq 0.2453500 0.06579101 0.06588606 0.1084553
rmse 0.2452216 0.26330694 0.26330206 0.2692242
mae 0.1795154 0.20162440 0.20163410 0.2051364